--- title: mahoudata keywords: fastai sidebar: home_sidebar summary: "API details." ---

The main idea of the data processing is to compute similarities among beers. In order to do so we are going to compute similarities with regard to:

  • Numeric ratings: include temperature
  • Description
  • Pairing
  • vajilla: tokenise and use jaccard distance

TODO:

  • Improve the remove-duplicates process: there are duplicates with missing values, capitalised letters, etc.
  • Replace missing values with median value
  • Compute distance in text variables
#UDFs
class Preprocess:
    """Data-munging helpers shared by the recommender strategies.

    Parameters
    ----------
    ctx : dict
        Pipeline context; `scale_cols` reads the `'numeric_cols'` entry
        (list of numeric rating column names).
    """

    def __init__(self, ctx):
        self.ctx = ctx

    def clean_duplicates(self):
        # TODO: detect duplicates based on description and attributes,
        # then remove them. Stub for now — returns a placeholder value.
        return 1

    def cols_munging(self, dataframe, fillna = True):
        """Rename, index and reorder the raw beer dataframe.

        - Renames 'Temperatura Servicio' to 'temperatura'.
        - Assigns a sequential 'beerID' (1..n) and uses its string form
          as the index.
        - Moves 'beerID' to the first column position.
        - Optionally replaces missing values with 0 (TODO: median/mean).
        """
        munged = dataframe.rename(columns={"Temperatura Servicio": "temperatura"})
        munged['beerID'] = range(1, len(munged) + 1)
        munged = munged.set_index(munged['beerID'].astype(str))
        # Put beerID first, keeping every other column in its original order.
        ordered = ['beerID'] + [col for col in munged.columns if col != 'beerID']
        munged = munged.reindex(columns=ordered)

        if fillna:
            munged = munged.fillna(0)

        return munged

    def scale_cols(self, dataframe):
        """Min-max scale the numeric rating columns into [0, 1].

        NOTE(review): `fit_transform` returns a bare ndarray, so the
        beerID index built in `cols_munging` is dropped here and the
        result gets a default RangeIndex. Downstream cells index the
        distance matrix by integer position and rely on this — confirm
        before "fixing".
        """
        numeric = dataframe[self.ctx['numeric_cols']]
        scaler = MinMaxScaler()
        return pd.DataFrame(scaler.fit_transform(numeric), columns=numeric.columns)

class RecomenderStrategyFactory:
    """Factory that maps a strategy name to a recommender implementation."""

    def __init__(self, ctx):
        self.context = ctx

    def createStrategy(self, strategy):
        """Return a recommender strategy instance.

        'numeric' (case-insensitive) yields a NumericStrategy; any other
        value falls back to DescriptionAndNumeric.
        """
        if strategy.lower() == 'numeric':
            return NumericStrategy(self.context)
        return DescriptionAndNumeric(self.context)
    
class NumericStrategy:
    """Recommender strategy based purely on the numeric rating columns.

    Builds a min-max-scaled feature matrix, then computes a square
    pairwise-distance matrix between beers.
    """

    def __init__(self, ctx):
        # ctx['numeric_cols'] lists the rating columns used as features.
        self.ctx = ctx

    def model_builder(self, dataframe):
        """Preprocess the raw dataframe and scale its numeric columns."""
        preprocessor = Preprocess(self.ctx)
        df = preprocessor.cols_munging(dataframe, fillna=True)
        return preprocessor.scale_cols(df)

    def exec_strategy(self, dataframe, distance='cosine'):
        """Return the square pairwise-distance DataFrame.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Must contain the columns listed in ctx['numeric_cols'].
        distance : str
            'euclidean' selects euclidean distance; any other value
            falls back to cosine (matches the original behavior).
        """
        metric = 'euclidean' if distance == 'euclidean' else 'cosine'
        features = dataframe[self.ctx['numeric_cols']]
        # Fix: both metrics now label rows AND columns with the same index.
        # Previously the euclidean branch cast the columns to str while the
        # cosine branch did not, so the two metrics returned differently
        # labeled frames.
        return pd.DataFrame(
            squareform(pdist(features, metric=metric)),
            columns=dataframe.index,
            index=dataframe.index,
        )
        
    


        

Explore Data

# Load the raw beer dataset and render an exploratory profiling report inline.
df = pd.read_csv("./data/dataset-datathon.csv")
# ProfileReport is pandas-profiling's HTML summary of the dataframe.
profile = ProfileReport(df, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

Remove duplicates

According to profile there are 60% duplicates. Get rid of them

# Drop exact duplicate rows (the profiling report reported ~60% duplicates).
df_clean = df.drop_duplicates(
# NOTE(review): deduplicating while ignoring 'vajilla' was tried and left
# disabled — confirm whether it should be re-enabled.
#subset = df.columns.difference(['vajilla'])
)
# Re-profile to verify the duplicates are gone.
profile = ProfileReport(df_clean, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

Run Recommender

# Shared pipeline context: 'numeric_cols' lists the rating columns used for
# scaling and pairwise-distance computation.
context = {'numeric_cols' : ['lupulo_afrutado_citrico', 
                             'lupulo_floral_herbal','amargor', 'color', 
                             'maltoso', 'licoroso', 'afrutado', 'especias','acidez']
}

f = RecomenderStrategyFactory(context)

# 'numeric' selects the ratings-only strategy.
strategy = f.createStrategy('numeric')

# Preprocess + scale the deduplicated data, then build the square
# beer-to-beer distance matrix (cosine by default).
datamodel = strategy.model_builder(df_clean)

recommender_df = strategy.exec_strategy(datamodel)

recommender_df
0 1 2 3 4 5 6 7 8 9 ... 476 477 478 479 480 481 482 483 484 485
0 0.000000 0.000000 0.042737 0.014204 0.019602 0.003507 0.046649 0.079535 0.019307 0.019307 ... 0.107993 0.039501 0.178008 0.153839 0.372661 0.048717 0.037445 0.063611 0.034118 0.033039
1 0.000000 0.000000 0.042737 0.014204 0.019602 0.003507 0.046649 0.079535 0.019307 0.019307 ... 0.107993 0.039501 0.178008 0.153839 0.372661 0.048717 0.037445 0.063611 0.034118 0.033039
2 0.042737 0.042737 0.000000 0.027731 0.111271 0.045083 0.139327 0.149810 0.109016 0.109016 ... 0.186975 0.019876 0.262107 0.083787 0.430263 0.024581 0.057169 0.110674 0.015178 0.009456
3 0.014204 0.014204 0.027731 0.000000 0.042773 0.014581 0.072928 0.073416 0.040805 0.040805 ... 0.122566 0.015192 0.194159 0.123382 0.377082 0.024367 0.027006 0.058254 0.011111 0.016657
4 0.019602 0.019602 0.111271 0.042773 0.000000 0.016331 0.029392 0.063220 0.008608 0.008608 ... 0.081491 0.092742 0.127279 0.209331 0.319723 0.095347 0.063383 0.051801 0.088428 0.089179
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
481 0.048717 0.048717 0.024581 0.024367 0.095347 0.044618 0.097419 0.099593 0.087663 0.087663 ... 0.106695 0.022456 0.219157 0.050305 0.295921 0.000000 0.027633 0.048033 0.017491 0.012491
482 0.037445 0.037445 0.057169 0.027006 0.063383 0.038345 0.038369 0.109984 0.049632 0.049632 ... 0.095466 0.046965 0.176550 0.113228 0.301670 0.027633 0.000000 0.049326 0.030405 0.046380
483 0.063611 0.063611 0.110674 0.058254 0.051801 0.046130 0.050065 0.060642 0.059130 0.059130 ... 0.027529 0.092380 0.117532 0.093103 0.154117 0.048033 0.049326 0.000000 0.089897 0.081997
484 0.034118 0.034118 0.015178 0.011111 0.088428 0.041219 0.107349 0.101764 0.073097 0.073097 ... 0.153602 0.004043 0.269176 0.114322 0.419526 0.017491 0.030405 0.089897 0.000000 0.008074
485 0.033039 0.033039 0.009456 0.016657 0.089179 0.036679 0.116344 0.099990 0.080648 0.080648 ... 0.145527 0.009101 0.260754 0.087771 0.393298 0.012491 0.046380 0.081997 0.008074 0.000000

486 rows × 486 columns

# Example: distances from beer at position 1 to every other beer, closest first.
# NOTE(review): the NaNs visible at the bottom of the output presumably come
# from rows whose scaled numeric ratings are all zero (cosine distance is
# undefined for zero vectors) — verify and decide how to handle them.
recommendations_example = pd.DataFrame(recommender_df[1].sort_values(ascending=True))
recommendations_example
1
0 0.000000
1 0.000000
452 0.000000
5 0.003507
305 0.003507
... ...
473 0.647605
142 0.660262
193 NaN
195 NaN
330 NaN

486 rows × 1 columns

In development

Below you can find work in progress

#Reshape to long form
# unstack() flattens the square matrix into a MultiIndex Series keyed by
# the (column, row) beer pair.
long_form_cosine = recommender_df.unstack()

#rename columns and turn into a dataframe
long_form_cosine.index.rename(['Beer A', 'Beer B'], inplace=True)
long_form_cosine = long_form_cosine.to_frame('cosine distance').reset_index()
long_form_cosine
Beer A Beer B cosine distance
0 0 0 0.000000
1 0 1 0.000000
2 0 2 0.042737
3 0 3 0.014204
4 0 4 0.019602
... ... ... ...
236191 485 481 0.012491
236192 485 482 0.046380
236193 485 483 0.081997
236194 485 484 0.008074
236195 485 485 0.000000

236196 rows × 3 columns

Test wrangling with NLTK

#df['tokenized_desc'] = df['desc'].apply(word_tokenize)